R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Package setup.
# NOTE(review): mlbench (L51), corrplot (L55) and dplyr (L70) are attached a
# second time further down — redundant but harmless, since library() is a
# no-op for an already-attached package.
library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench) 
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
library(lattice)
library(caret)
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(caTools)#for splitting data into testing and training sets
library(dplyr) #manipulating dataframe
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mlbench)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(corrplot)
library(catboost)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the Boston housing data (shipped with the mlbench package).
data("BostonHousing")


housing <- BostonHousing

dim(housing)
## [1] 506  14
str(housing)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Visual check for any NA's in the dataframe (missingness map).
missmap(housing, col = c('yellow', 'black'), y.at = 1, y.labels = '', legend = TRUE)

# Count missing values. sum(is.na(.)) is the idiomatic form of
# length(which(is.na(.) == T)) and avoids the reassignable T alias.
numberOfNA <- sum(is.na(housing))
numberOfNA
## [1] 0
# Remove NA values (a no-op here, since there are none).
housing <- housing %>%
        drop_na()

str(housing) # gives the structure of data
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
dim(housing)
## [1] 506  14
# Convert 'chas' (a factor with levels "0"/"1") to numeric.
# BUG FIX: as.numeric() on a factor returns the internal level codes (1/2),
# not the labels — the recorded output below shows 1/2 coding. Going through
# as.character() preserves the original 0/1 dummy coding.
housing$chas <- as.numeric(as.character(housing$chas))
str(housing$chas)
## (recorded output below was produced by the old 1/2 coding; with the fix the
##  values are 0/1)
##  num [1:506] 1 1 1 1 1 1 1 1 1 1 ...
head(housing)# returns the first six rows of the data
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b lstat
## 1 0.00632 18  2.31    1 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    1 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    1 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    1 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    1 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    1 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(housing)  # basic statistics of the dataset: min/max, mean, median, 1st and 3rd quartiles
##       crim                zn             indus            chas      
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :1.000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:1.000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :1.000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :1.069  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:1.000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :2.000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio            b         
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
# In the summary above we can see that variables 'crim' and 'b' take a wide range of values.

# Variables 'crim', 'zn', 'rm' and 'b' have a large difference between their median and mean,
# which indicates a lot of outliers in those variables.

# Boxplots of the four skewed variables flagged above, drawn in one row.
# Save and restore the graphics state so the mfrow change does not leak into
# every later base-graphics plot in the session.
old_par <- par(mfrow = c(1, 4))
boxplot(housing$crim, main='crim',col='Sky Blue')
boxplot(housing$zn, main='zn',col='Sky Blue')
boxplot(housing$rm, main='rm',col='Sky Blue')
boxplot(housing$b, main='b',col='Sky Blue')
par(old_par)

# As suggested earlier, variables 'crim', 'zn', 'rm' and 'b' do have a lot of outliers.


# Finding correlation
# Correlation is a statistical measure that suggests the level of linear dependence
# between two variables that occur in pair. Its value lies between -1 and +1.

# If above 0 it means positive correlation, i.e. X is directly proportional to Y.
# If below 0 it means negative correlation, i.e. X is inversely proportional to Y.
# Value 0 suggests a weak relation.
# Usually we would use the function 'cor' to find the correlation between two variables,
# but since we have 14 variables here, it is easier to examine the correlations with the
# corrplot function from the 'corrplot' package.

# Correlation plots are a great way of exploring data and seeing the level of
# interaction between the variables.


corrplot(cor(housing))

# Looking at the plot, we can see that (except for the diagonal):
# Attributes like 'tax and rad', 'nox and tax', 'age and indus' have positive correlation.
# Larger, darker blue dots suggest a strong positive relationship.
# Attributes like 'dis and nox', 'dis and indus', 'age and dis' have negative correlation.
# Larger, darker red dots suggest a strong negative relationship.

# Highly correlated variables
correlated <- cor(housing)
highCorr <- findCorrelation(correlated, cutoff=0.70)
highCorr
## [1]  3 10  5 13  8
names(housing[highCorr])
## [1] "indus" "tax"   "nox"   "lstat" "dis"

# Split the dataset into train and test sets: 75% of the rows to train the
# models, the remaining 25% to test them.

set.seed(120)
# BUG FIX: sample.split() expects the vector of outcome labels, not the whole
# data frame. Passing the data frame yields a length-14 (one-per-column)
# logical that subset() then silently recycles over the 506 rows, producing a
# deterministic, column-count-patterned — i.e. non-random — split (hence the
# recorded 362/144 row counts rather than ~380/126). Split on the target
# column instead.
split <- sample.split(housing$medv, SplitRatio = 0.75)
train <- subset(housing, split == TRUE)
test <- subset(housing, split == FALSE)

dim(housing)
## [1] 506  14
dim(train)
## (recorded outputs below came from the buggy data-frame split; with the fix
##  train holds ~75% of the 506 rows)
## [1] 362  14
dim(test)
## [1] 144  14
summary(train)
##       crim                zn            indus            chas      
##  Min.   : 0.00632   Min.   :  0.0   Min.   : 0.46   Min.   :1.000  
##  1st Qu.: 0.08276   1st Qu.:  0.0   1st Qu.: 5.13   1st Qu.:1.000  
##  Median : 0.25636   Median :  0.0   Median : 9.69   Median :1.000  
##  Mean   : 3.52093   Mean   : 11.2   Mean   :11.05   Mean   :1.061  
##  3rd Qu.: 3.51983   3rd Qu.:  0.0   3rd Qu.:18.10   3rd Qu.:1.000  
##  Max.   :88.97620   Max.   :100.0   Max.   :27.74   Max.   :2.000  
##       nox               rm             age              dis        
##  Min.   :0.3890   Min.   :3.561   Min.   :  2.90   Min.   : 1.137  
##  1st Qu.:0.4490   1st Qu.:5.895   1st Qu.: 42.45   1st Qu.: 2.076  
##  Median :0.5380   Median :6.213   Median : 77.75   Median : 3.191  
##  Mean   :0.5535   Mean   :6.305   Mean   : 68.03   Mean   : 3.803  
##  3rd Qu.:0.6240   3rd Qu.:6.640   3rd Qu.: 94.10   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio           b         
##  Min.   : 1.000   Min.   :187.0   Min.   :12.6   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.4   1st Qu.:375.24  
##  Median : 5.000   Median :329.0   Median :19.1   Median :391.70  
##  Mean   : 9.378   Mean   :405.8   Mean   :18.5   Mean   :357.90  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:396.90  
##  Max.   :24.000   Max.   :711.0   Max.   :22.0   Max.   :396.90  
##      lstat             medv      
##  Min.   : 1.730   Min.   : 5.00  
##  1st Qu.: 6.728   1st Qu.:16.80  
##  Median :10.685   Median :21.40  
##  Mean   :12.315   Mean   :22.67  
##  3rd Qu.:16.402   3rd Qu.:25.27  
##  Max.   :37.970   Max.   :50.00
# Simple linear regression baseline: predict medv from lstat alone.
model <- lm(medv ~ lstat , data = train) # fit a simple linear regression model
summary(model)
## 
## Call:
## lm(formula = medv ~ lstat, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.071  -4.154  -1.475   2.202  24.629 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.61596    0.67468   51.31   <2e-16 ***
## lstat       -0.97012    0.04781  -20.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.268 on 360 degrees of freedom
## Multiple R-squared:  0.5335, Adjusted R-squared:  0.5322 
## F-statistic: 411.7 on 1 and 360 DF,  p-value: < 2.2e-16
# Predict on the held-out test set and report RMSE / R-squared / MAE (caret).
pSimpleLineer <- predict(model,test)
postResample(pSimpleLineer,test$medv)
##      RMSE  Rsquared       MAE 
## 6.1010062 0.5741069 4.1948060
# Actual vs predicted medv, with a loess smoother, rendered interactively
# through plotly.
plLinearSimple <-test %>% 
  ggplot(aes(medv,pSimpleLineer)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Multiple Linear Regression
# Build a model treating crim, rm, tax and lstat as the major influencers on
# the target variable.
model2 <- lm(medv ~ crim + rm + tax + lstat , data = train)
summary(model2)
## 
## Call:
## lm(formula = medv ~ crim + rm + tax + lstat, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.977  -3.481  -1.176   1.871  30.418 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.110293   3.571261  -0.031   0.9754    
## crim        -0.081763   0.038589  -2.119   0.0348 *  
## rm           5.070090   0.493432  10.275   <2e-16 ***
## tax         -0.004923   0.002282  -2.158   0.0316 *  
## lstat       -0.560441   0.059535  -9.414   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.465 on 357 degrees of freedom
## Multiple R-squared:  0.6483, Adjusted R-squared:  0.6444 
## F-statistic: 164.5 on 4 and 357 DF,  p-value: < 2.2e-16
model2
## 
## Call:
## lm(formula = medv ~ crim + rm + tax + lstat, data = train)
## 
## Coefficients:
## (Intercept)         crim           rm          tax        lstat  
##   -0.110293    -0.081763     5.070090    -0.004923    -0.560441
# Evaluate on the test set.
pMultipleLm <- predict(model2,test)
postResample(pMultipleLm,test$medv)
##      RMSE  Rsquared       MAE 
## 5.4726430 0.6579667 3.6944289
# Actual vs predicted plot for the multiple regression model.
plotMultipleLineer <-test %>% 
  ggplot(aes(medv,pMultipleLm)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price')+
  theme_bw()

ggplotly(plotMultipleLineer)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Remove Highly Correlated Variables and build models
# Create new dataset w/o highly correlated variables. corr_data drops the
# columns indexed by highCorr (indus, tax, nox, lstat, dis) but keeps medv,
# which remains the response below.
corr_data <- train[,-highCorr]

model3 <- lm(medv ~ . , data = corr_data)

# predict() ignores extra columns in newdata, so the full test frame can be
# passed even though the model was fit on a reduced predictor set.
pSimpleLineer3 <- predict(model3,test)
postResample(pSimpleLineer3,test$medv)
##      RMSE  Rsquared       MAE 
## 5.3167942 0.6726429 3.3492280
# Cross-validated linear regression.
# NOTE: the original defined x <- data.matrix(train[,1:13]) and
# y <- train$medv here, but neither was used anywhere in the script;
# removed as dead code.

# 10-fold cross-validation resampling scheme, reused by the glmnet models
# below as well.
control <- trainControl(method = "cv",
                        number = 10)

# Linear model on all predictors, tuned/evaluated with 10-fold CV via caret.
lineerCV <- train(medv~.,
                data = train,
                method = "lm",
                trControl = control )


pLineerCV <- predict(lineerCV,test)
postResample(pLineerCV,test$medv)
##      RMSE  Rsquared       MAE 
## 4.7853860 0.7395806 3.1963155
# Linear regression with CV: actual vs predicted plot.
plLinearCV <-test %>% 
  ggplot(aes(medv,pLineerCV)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plLinearCV)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Ridge regression via glmnet: alpha = 0 selects the pure L2 penalty; lambda
# is tuned over a 50-point grid using the same 10-fold CV control.
ridge <- train(medv~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 0,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


pRidge <- predict(ridge,test)
postResample(pRidge,test$medv)
##      RMSE  Rsquared       MAE 
## 4.7743466 0.7361985 3.0830773
# Ridge: actual vs predicted plot.
plRidge <-test %>% 
  ggplot(aes(medv,pRidge)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plRidge)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Lasso regression via glmnet: alpha = 1 selects the pure L1 penalty; same
# lambda grid and CV control as the ridge model.
lasso <- train(medv~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 1,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


# Test RMSE
pLasso <- predict(lasso,test)

postResample(pLasso,test$medv)
##      RMSE  Rsquared       MAE 
## 4.7777320 0.7393834 3.1644723
# Lasso: actual vs predicted plot.

plLasso <-test %>% 
  ggplot(aes(medv,pLasso)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plLasso)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Separate x and y of the train and test datasets; the catboost package wants
# features and labels supplied separately.
y_train <- unlist(train[c('medv')])
X_train <- train %>% select(-medv)
y_valid <- unlist(test[c('medv')])
X_valid <- test %>% select(-medv)

# Convert the train and test datasets to catboost's pool format via
# catboost.load_pool, passing features (data) and target (label).
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)

# Input params for the CatBoost regression.
# NOTE(review): use_best_model=TRUE requires an evaluation set; since
# catboost.train below is called without test_pool, catboost switches it off
# (see the recorded warning).
params <- list(iterations=500,
               learning_rate=0.01,
               depth=10,
               loss_function='RMSE',
               eval_metric='RMSE',
               random_seed = 55,
               od_type='Iter',
               metric_period = 50,
               od_wait=20,
               use_best_model=TRUE)


# iterations: the maximum number of trees built while fitting.
# Train the model on the training pool with the parameters above.
modelCatboost <- catboost.train(learn_pool = train_pool,params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0:   learn: 9.0969322    total: 164ms    remaining: 1m 21s
## 50:  learn: 7.0396136    total: 957ms    remaining: 8.42s
## 100: learn: 5.6154938    total: 1.66s    remaining: 6.58s
## 150: learn: 4.6142807    total: 2.39s    remaining: 5.52s
## 200: learn: 3.8905746    total: 3.14s    remaining: 4.67s
## 250: learn: 3.3397315    total: 3.87s    remaining: 3.84s
## 300: learn: 2.9277197    total: 4.59s    remaining: 3.04s
## 350: learn: 2.6162423    total: 5.35s    remaining: 2.27s
## 400: learn: 2.3565605    total: 6.1s remaining: 1.51s
## 450: learn: 2.1541226    total: 6.85s    remaining: 744ms
## 499: learn: 1.9844896    total: 7.59s    remaining: 0us
# Predict on the test pool.
y_pred=catboost.predict(modelCatboost,test_pool)

# Calculate error metrics (RMSE / R-squared / MAE).
catboostMetrics <- postResample(y_pred,test$medv)
catboostMetrics
##      RMSE  Rsquared       MAE 
## 3.6521302 0.8772542 2.3072133
# Catboost: actual vs predicted plot.
plCatboost <-test %>% 
  ggplot(aes(medv,y_pred)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plCatboost)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Catboost without highly correlated variables.
# BUG FIX (target leakage): corr_data (= train[, -highCorr]) still contains
# the target medv, so the original code trained with medv among the FEATURES
# while also using it as the label — leakage that explains the implausibly
# good recorded metrics below (R² 0.96). Drop medv from the feature side of
# both pools.

corr_dataTEST <- test[,-highCorr]
train_pool2 <- catboost.load_pool(data = corr_data %>% select(-medv),
                                  label = y_train)
test_pool2 <- catboost.load_pool(data = corr_dataTEST %>% select(-medv),
                                 label = y_valid)
# Build a model on the reduced feature set.
modelCatboost2 <- catboost.train(learn_pool = train_pool2,params = params)
## (recorded training log and metrics below were produced by the leaky
##  version; expect realistic — i.e. worse — numbers after the fix)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0:   learn: 9.0952492    total: 13.5ms   remaining: 6.75s
## 50:  learn: 6.6649525    total: 568ms    remaining: 5s
## 100: learn: 4.9337798    total: 1.1s remaining: 4.36s
## 150: learn: 3.7255671    total: 1.7s remaining: 3.92s
## 200: learn: 2.8594591    total: 2.21s    remaining: 3.29s
## 250: learn: 2.2309073    total: 2.77s    remaining: 2.75s
## 300: learn: 1.7771642    total: 3.31s    remaining: 2.19s
## 350: learn: 1.4343749    total: 3.87s    remaining: 1.64s
## 400: learn: 1.1869029    total: 4.44s    remaining: 1.09s
## 450: learn: 0.9949640    total: 4.97s    remaining: 540ms
## 499: learn: 0.8452080    total: 5.47s    remaining: 0us
# Predict and compute error metrics.
y_pred2 <- catboost.predict(modelCatboost2,test_pool2)
catboostMetrics2 <- postResample(y_pred2,test$medv)
catboostMetrics2
##      RMSE  Rsquared       MAE 
## 2.1770161 0.9638678 0.9684344
# Plot: actual vs predicted medv for the reduced-feature model.
plCatboost2 <-test %>% 
  ggplot(aes(medv,y_pred2)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv')+
  theme_bw()

ggplotly(plCatboost2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'